Final Project

Data Collection

In [1]:
rm(list=ls())
In [2]:
options(warn=-1)
library(tidyverse)
-- Attaching packages --------------------------------------- tidyverse 1.3.0 --

v ggplot2 3.3.2     v purrr   0.3.4
v tibble  3.0.3     v dplyr   1.0.2
v tidyr   1.1.2     v stringr 1.4.0
v readr   1.3.1     v forcats 0.5.0

-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()

Now, we will read all the necessory CSV files and save the data which we need to use.

In [3]:
sales = read_csv("sales_train.csv") 
head(sales)
Parsed with column specification:
cols(
  date = col_character(),
  date_block_num = col_double(),
  shop_id = col_double(),
  item_id = col_double(),
  item_price = col_double(),
  item_cnt_day = col_double()
)

A tibble: 6 × 6
datedate_block_numshop_iditem_iditem_priceitem_cnt_day
<chr><dbl><dbl><dbl><dbl><dbl>
02.01.201305922154 999.00 1
03.01.2013025 2552 899.00 1
05.01.2013025 2552 899.00-1
06.01.2013025 25541709.05 1
15.01.2013025 25551099.00 1
10.01.2013025 2564 349.00 1
In [4]:
items = read_csv("items.csv")
head(items)
Parsed with column specification:
cols(
  item_name = col_character(),
  item_id = col_double(),
  item_category_id = col_double()
)

A tibble: 6 × 3
item_nameitem_iditem_category_id
<chr><dbl><dbl>
! <U+0412><U+041E> <U+0412><U+041B><U+0410><U+0421><U+0422><U+0418> <U+041D><U+0410><U+0412><U+0410><U+0416><U+0414><U+0415><U+041D><U+0418><U+042F> (<U+041F><U+041B><U+0410><U+0421><U+0422>.) D 040
!ABBYY FineReader 12 Professional Edition Full [PC, <U+0426><U+0438><U+0444><U+0440><U+043E><U+0432><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] 176
***<U+0412> <U+041B><U+0423><U+0427><U+0410><U+0425> <U+0421><U+041B><U+0410><U+0412><U+042B> (UNV) D 240
***<U+0413><U+041E><U+041B><U+0423><U+0411><U+0410><U+042F> <U+0412><U+041E><U+041B><U+041D><U+0410> (Univ) D 340
***<U+041A><U+041E><U+0420><U+041E><U+0411><U+041A><U+0410> (<U+0421><U+0422><U+0415><U+041A><U+041B><U+041E>) D 440
***<U+041D><U+041E><U+0412><U+042B><U+0415> <U+0410><U+041C><U+0415><U+0420><U+0418><U+041A><U+0410><U+041D><U+0421><U+041A><U+0418><U+0415> <U+0413><U+0420><U+0410><U+0424><U+0424><U+0418><U+0422><U+0418> (UNI) D540
In [5]:
shops = read_csv("shops.csv")
head(shops)
Parsed with column specification:
cols(
  shop_name = col_character(),
  shop_id = col_double()
)

A tibble: 6 × 2
shop_nameshop_id
<chr><dbl>
!<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 <U+0444><U+0440><U+0430><U+043D> 0
!<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" <U+0444><U+0440><U+0430><U+043D> 1
<U+0410><U+0434><U+044B><U+0433><U+0435><U+044F> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 2
<U+0411><U+0430><U+043B><U+0430><U+0448><U+0438><U+0445><U+0430> <U+0422><U+0420><U+041A> "<U+041E><U+043A><U+0442><U+044F><U+0431><U+0440><U+044C>-<U+041A><U+0438><U+043D><U+043E><U+043C><U+0438><U+0440>"3
<U+0412><U+043E><U+043B><U+0436><U+0441><U+043A><U+0438><U+0439> <U+0422><U+0426> "<U+0412><U+043E><U+043B><U+0433><U+0430> <U+041C><U+043E><U+043B><U+043B>" 4
<U+0412><U+043E><U+043B><U+043E><U+0433><U+0434><U+0430> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+0440><U+043C><U+0435><U+043B><U+0430><U+0434>" 5
In [6]:
item_categories = read_csv("item_categories.csv" )
head(item_categories)
Parsed with column specification:
cols(
  item_category_name = col_character(),
  item_category_id = col_double()
)

A tibble: 6 × 2
item_category_nameitem_category_id
<chr><dbl>
PC - <U+0413><U+0430><U+0440><U+043D><U+0438><U+0442><U+0443><U+0440><U+044B>/<U+041D><U+0430><U+0443><U+0448><U+043D><U+0438><U+043A><U+0438>0
<U+0410><U+043A><U+0441><U+0435><U+0441><U+0441><U+0443><U+0430><U+0440><U+044B> - PS2 1
<U+0410><U+043A><U+0441><U+0435><U+0441><U+0441><U+0443><U+0430><U+0440><U+044B> - PS3 2
<U+0410><U+043A><U+0441><U+0435><U+0441><U+0441><U+0443><U+0430><U+0440><U+044B> - PS4 3
<U+0410><U+043A><U+0441><U+0435><U+0441><U+0441><U+0443><U+0430><U+0440><U+044B> - PSP 4
<U+0410><U+043A><U+0441><U+0435><U+0441><U+0441><U+0443><U+0430><U+0440><U+044B> - PSVita 5
In [7]:
# Joining the items, shops and item_categories data to the sales data

merged_sales = sales %>% 
inner_join(items, by = "item_id") %>%
inner_join(shops, by = "shop_id") %>%
inner_join(item_categories, by = "item_category_id")
head(merged_sales)
A tibble: 6 × 10
datedate_block_numshop_iditem_iditem_priceitem_cnt_dayitem_nameitem_category_idshop_nameitem_category_name
<chr><dbl><dbl><dbl><dbl><dbl><chr><dbl><chr><chr>
02.01.201305922154 999.00 1<U+042F><U+0412><U+041B><U+0415><U+041D><U+0418><U+0415> 2012 (BD) 37<U+042F><U+0440><U+043E><U+0441><U+043B><U+0430><U+0432><U+043B><U+044C> <U+0422><U+0426> "<U+0410><U+043B><U+044C><U+0442><U+0430><U+0438><U+0440>"<U+041A><U+0438><U+043D><U+043E> - Blu-Ray
03.01.2013025 2552 899.00 1DEEP PURPLE The House Of Blue Light LP 58<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+0412><U+0438><U+043D><U+0438><U+043B>
05.01.2013025 2552 899.00-1DEEP PURPLE The House Of Blue Light LP 58<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+0412><U+0438><U+043D><U+0438><U+043B>
06.01.2013025 25541709.05 1DEEP PURPLE Who Do You Think We Are LP 58<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+0412><U+0438><U+043D><U+0438><U+043B>
15.01.2013025 25551099.00 1DEEP PURPLE 30 Very Best Of 2CD (<U+0424><U+0438><U+0440><U+043C>.) 56<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - CD <U+0444><U+0438><U+0440><U+043C><U+0435><U+043D><U+043D><U+043E><U+0433><U+043E> <U+043F><U+0440><U+043E><U+0438><U+0437><U+0432><U+043E><U+0434><U+0441><U+0442><U+0432><U+0430>
10.01.2013025 2564 349.00 1DEEP PURPLE Perihelion: Live In Concert DVD (<U+041A><U+0438><U+0440>.)59<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430><U+043B><U+044C><U+043D><U+043E><U+0435> <U+0432><U+0438><U+0434><U+0435><U+043E>
In [8]:
# Seprate dates into month, day, year

library(lubridate) 
merged_sales$date = dmy(merged_sales$date)

merged_sales$year = year(merged_sales$date)
merged_sales$month = month(merged_sales$date)
merged_sales$day = day(merged_sales$date)
merged_sales$weekday = weekdays(merged_sales$date)
Attaching package: 'lubridate'


The following objects are masked from 'package:base':

    date, intersect, setdiff, union


In [9]:
# Converting the columns into factors

merged_sales$year = as.factor(merged_sales$year)
merged_sales$month = as.factor(merged_sales$month)
merged_sales$weekday = as.factor(merged_sales$weekday)

merged_sales$shop_id = as.factor(merged_sales$shop_id)
merged_sales$item_id = as.factor(merged_sales$item_id)
merged_sales$item_category_id = as.factor(merged_sales$item_category_id)

str(merged_sales)
tibble [2,935,849 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ date              : Date[1:2935849], format: "2013-01-02" "2013-01-03" ...
 $ date_block_num    : num [1:2935849] 0 0 0 0 0 0 0 0 0 0 ...
 $ shop_id           : Factor w/ 60 levels "0","1","2","3",..: 60 26 26 26 26 26 26 26 26 26 ...
 $ item_id           : Factor w/ 21807 levels "0","1","2","3",..: 21792 2496 2496 2498 2499 2508 2509 2515 2515 2516 ...
 $ item_price        : num [1:2935849] 999 899 899 1709 1099 ...
 $ item_cnt_day      : num [1:2935849] 1 1 -1 1 1 1 1 1 1 3 ...
 $ item_name         : chr [1:2935849] "<U+042F><U+0412><U+041B><U+0415><U+041D><U+0418><U+0415> 2012 (BD)" "DEEP PURPLE  The House Of Blue Light  LP" "DEEP PURPLE  The House Of Blue Light  LP" "DEEP PURPLE  Who Do You Think We Are  LP" ...
 $ item_category_id  : Factor w/ 84 levels "0","1","2","3",..: 38 59 59 59 57 60 57 56 56 56 ...
 $ shop_name         : chr [1:2935849] "<U+042F><U+0440><U+043E><U+0441><U+043B><U+0430><U+0432><U+043B><U+044C> <U+0422><U+0426> \"<U+0410><U+043B><U+"| __truncated__ "<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> \"<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>\"" "<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> \"<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>\"" "<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> \"<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>\"" ...
 $ item_category_name: chr [1:2935849] "<U+041A><U+0438><U+043D><U+043E> - Blu-Ray" "<U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+0412><U+0438><U+043D><U+0438><U+043B>" "<U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+0412><U+0438><U+043D><U+0438><U+043B>" "<U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+0412><U+0438><U+043D><U+0438><U+043B>" ...
 $ year              : Factor w/ 3 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ month             : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ day               : int [1:2935849] 2 3 5 6 15 10 2 4 11 3 ...
 $ weekday           : Factor w/ 7 levels "Friday","Monday",..: 7 5 3 4 6 5 7 1 1 5 ...
 - attr(*, "spec")=
  .. cols(
  ..   date = col_character(),
  ..   date_block_num = col_double(),
  ..   shop_id = col_double(),
  ..   item_id = col_double(),
  ..   item_price = col_double(),
  ..   item_cnt_day = col_double()
  .. )
In [10]:
# Check for missing data 

any(is.na(merged_sales))
FALSE

This is necessory just to make sure we don't have any NAs which can cause problems in the future.

Feature Choice

We will visualise the data first in order the find out which features are important.

In [11]:
#Total items by month

aggregate(item_cnt_day~month, merged_sales, sum)
A data.frame: 12 × 2
monthitem_cnt_day
<fct><dbl>
1 359349
2 321975
3 344453
4 281573
5 277055
6 286924
7 271433
8 294091
9 305383
10306019
11247854
12352097
In [12]:
# This will give number of distinct shops 

no_of_shops = merged_sales %>% 
select(shop_id) %>% 
distinct() %>% 
summarise(shops=n())

no_of_shops
A tibble: 1 × 1
shops
<int>
60
In [13]:
# This will give number of times a shop apperars in the data set.

shop_frequency = merged_sales %>% 
group_by(shop_id,shop_name) %>% 
summarise(number=n())

shop_frequency
`summarise()` regrouping output by 'shop_id' (override with `.groups` argument)

A grouped_df: 60 × 3
shop_idshop_namenumber
<fct><chr><int>
0 !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 <U+0444><U+0440><U+0430><U+043D> 9857
1 !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" <U+0444><U+0440><U+0430><U+043D> 5678
2 <U+0410><U+0434><U+044B><U+0433><U+0435><U+044F> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 25991
3 <U+0411><U+0430><U+043B><U+0430><U+0448><U+0438><U+0445><U+0430> <U+0422><U+0420><U+041A> "<U+041E><U+043A><U+0442><U+044F><U+0431><U+0440><U+044C>-<U+041A><U+0438><U+043D><U+043E><U+043C><U+0438><U+0440>" 25532
4 <U+0412><U+043E><U+043B><U+0436><U+0441><U+043A><U+0438><U+0439> <U+0422><U+0426> "<U+0412><U+043E><U+043B><U+0433><U+0430> <U+041C><U+043E><U+043B><U+043B>" 38242
5 <U+0412><U+043E><U+043B><U+043E><U+0433><U+0434><U+0430> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+0440><U+043C><U+0435><U+043B><U+0430><U+0434>" 38179
6 <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> (<U+041F><U+043B><U+0435><U+0445><U+0430><U+043D><U+043E><U+0432><U+0441><U+043A><U+0430><U+044F>, 13) 82663
7 <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+043A><U+0441><U+0438><U+043C><U+0438><U+0440>" 58076
8 <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> <U+0422><U+0420><U+0426> <U+0421><U+0438><U+0442><U+0438>-<U+041F><U+0430><U+0440><U+043A> "<U+0413><U+0440><U+0430><U+0434>" 3412
9 <U+0412><U+044B><U+0435><U+0437><U+0434><U+043D><U+0430><U+044F> <U+0422><U+043E><U+0440><U+0433><U+043E><U+0432><U+043B><U+044F> 3751
10<U+0416><U+0443><U+043A><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0443><U+043B>. <U+0427><U+043A><U+0430><U+043B><U+043E><U+0432><U+0430> 39<U+043C>? 21397
11<U+0416><U+0443><U+043A><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0443><U+043B>. <U+0427><U+043A><U+0430><U+043B><U+043E><U+0432><U+0430> 39<U+043C>² 499
12<U+0418><U+043D><U+0442><U+0435><U+0440><U+043D><U+0435><U+0442>-<U+043C><U+0430><U+0433><U+0430><U+0437><U+0438><U+043D> <U+0427><U+0421> 34694
13<U+041A><U+0430><U+0437><U+0430><U+043D><U+044C> <U+0422><U+0426> "<U+0411><U+0435><U+0445><U+0435><U+0442><U+043B><U+0435>" 17824
14<U+041A><U+0430><U+0437><U+0430><U+043D><U+044C> <U+0422><U+0426> "<U+041F><U+0430><U+0440><U+043A><U+0425><U+0430><U+0443><U+0441>" II 36979
15<U+041A><U+0430><U+043B><U+0443><U+0433><U+0430> <U+0422><U+0420><U+0426> "XXI <U+0432><U+0435><U+043A>" 59511
16<U+041A><U+043E><U+043B><U+043E><U+043C><U+043D><U+0430> <U+0422><U+0426> "<U+0420><U+0438><U+043E>" 52734
17<U+041A><U+0440><U+0430><U+0441><U+043D><U+043E><U+044F><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+0412><U+0437><U+043B><U+0435><U+0442><U+043A><U+0430> <U+041F><U+043B><U+0430><U+0437><U+0430>" 22950
18<U+041A><U+0440><U+0430><U+0441><U+043D><U+043E><U+044F><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+0418><U+044E><U+043D><U+044C>" 53227
19<U+041A><U+0443><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+041F><U+0443><U+0448><U+043A><U+0438><U+043D><U+0441><U+043A><U+0438><U+0439>" 63911
20<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> "<U+0420><U+0430><U+0441><U+043F><U+0440><U+043E><U+0434><U+0430><U+0436><U+0430>" 1792
21<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+041C><U+0422><U+0420><U+0426> "<U+0410><U+0444><U+0438> <U+041C><U+043E><U+043B><U+043B>" 58133
22<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+041C><U+0430><U+0433><U+0430><U+0437><U+0438><U+043D> <U+0421>21 45434
23<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+041A> "<U+0411><U+0443><U+0434><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" (<U+043F><U+0430><U+0432>.<U+0410>2) 6963
24<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+041A> "<U+0411><U+0443><U+0434><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" (<U+043F><U+0430><U+0432>.<U+041A>7) 53032
25<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" 186104
26<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0410><U+0440><U+0435><U+0430><U+043B>" (<U+0411><U+0435><U+043B><U+044F><U+0435><U+0432><U+043E>) 53910
27<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041C><U+0415><U+0413><U+0410> <U+0411><U+0435><U+043B><U+0430><U+044F> <U+0414><U+0430><U+0447><U+0430> II" 105366
28<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041C><U+0415><U+0413><U+0410> <U+0422><U+0435><U+043F><U+043B><U+044B><U+0439> <U+0421><U+0442><U+0430><U+043D>" II 142234
29<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041D><U+043E><U+0432><U+044B><U+0439> <U+0432><U+0435><U+043A>" (<U+041D><U+043E><U+0432><U+043E><U+043A><U+043E><U+0441><U+0438><U+043D><U+043E>) 49225
30<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041F><U+0435><U+0440><U+043B><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" 50860
31<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+043C><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" 235636
32<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+0440><U+0435><U+0431><U+0440><U+044F><U+043D><U+044B><U+0439> <U+0414><U+043E><U+043C>" 7947
33<U+041C><U+044B><U+0442><U+0438><U+0449><U+0438> <U+0422><U+0420><U+041A> "XL-3" 5027
34<U+041D>.<U+041D><U+043E><U+0432><U+0433><U+043E><U+0440><U+043E><U+0434> <U+0422><U+0420><U+0426> "<U+0420><U+0418><U+041E>" 5752
35<U+041D>.<U+041D><U+043E><U+0432><U+0433><U+043E><U+0440><U+043E><U+0434> <U+0422><U+0420><U+0426> "<U+0424><U+0430><U+043D><U+0442><U+0430><U+0441><U+0442><U+0438><U+043A><U+0430>" 58445
36<U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A> <U+0422><U+0420><U+0426> "<U+0413><U+0430><U+043B><U+0435><U+0440><U+0435><U+044F> <U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A>" 306
37<U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 39638
38<U+041E><U+043C><U+0441><U+043A> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 46013
39<U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0420><U+041A> "<U+041C><U+0435><U+0433><U+0430><U+0446><U+0435><U+043D><U+0442><U+0440> <U+0413><U+043E><U+0440><U+0438><U+0437><U+043E><U+043D><U+0442>" 13440
40<U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0420><U+041A> "<U+041C><U+0435><U+0433><U+0430><U+0446><U+0435><U+043D><U+0442><U+0440> <U+0413><U+043E><U+0440><U+0438><U+0437><U+043E><U+043D><U+0442>" <U+041E><U+0441><U+0442><U+0440><U+043E><U+0432><U+043D><U+043E><U+0439> 4257
41<U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 41967
42<U+0421><U+041F><U+0431> <U+0422><U+041A> "<U+041D><U+0435><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0426><U+0435><U+043D><U+0442><U+0440>" 109253
43<U+0421><U+041F><U+0431> <U+0422><U+041A> "<U+0421><U+0435><U+043D><U+043D><U+0430><U+044F>" 39282
44<U+0421><U+0430><U+043C><U+0430><U+0440><U+0430> <U+0422><U+0426> "<U+041C><U+0435><U+043B><U+043E><U+0434><U+0438><U+044F>" 39530
45<U+0421><U+0430><U+043C><U+0430><U+0440><U+0430> <U+0422><U+0426> "<U+041F><U+0430><U+0440><U+043A><U+0425><U+0430><U+0443><U+0441>" 35891
46<U+0421><U+0435><U+0440><U+0433><U+0438><U+0435><U+0432> <U+041F><U+043E><U+0441><U+0430><U+0434> <U+0422><U+0426> "7<U+042F>" 66321
47<U+0421><U+0443><U+0440><U+0433><U+0443><U+0442> <U+0422><U+0420><U+0426> "<U+0421><U+0438><U+0442><U+0438> <U+041C><U+043E><U+043B><U+043B>" 56695
48<U+0422><U+043E><U+043C><U+0441><U+043A> <U+0422><U+0420><U+0426> "<U+0418><U+0437><U+0443><U+043C><U+0440><U+0443><U+0434><U+043D><U+044B><U+0439> <U+0413><U+043E><U+0440><U+043E><U+0434>" 21612
49<U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0420><U+0426> "<U+041A><U+0440><U+0438><U+0441><U+0442><U+0430><U+043B><U+043B>" 15849
50<U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0426> "<U+0413><U+0443><U+0434><U+0432><U+0438><U+043D>" 65173
51<U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0426> "<U+0417><U+0435><U+043B><U+0435><U+043D><U+044B><U+0439> <U+0411><U+0435><U+0440><U+0435><U+0433>" 44433
52<U+0423><U+0444><U+0430> <U+0422><U+041A> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" 43502
53<U+0423><U+0444><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+043C><U+044C><U+044F>" 2 52921
54<U+0425><U+0438><U+043C><U+043A><U+0438> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 143480
55<U+0426><U+0438><U+0444><U+0440><U+043E><U+0432><U+043E><U+0439> <U+0441><U+043A><U+043B><U+0430><U+0434> 1<U+0421>-<U+041E><U+043D><U+043B><U+0430><U+0439><U+043D> 34769
56<U+0427><U+0435><U+0445><U+043E><U+0432> <U+0422><U+0420><U+0426> "<U+041A><U+0430><U+0440><U+043D><U+0430><U+0432><U+0430><U+043B>" 69573
57<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 117428
58<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" 71441
59<U+042F><U+0440><U+043E><U+0441><U+043B><U+0430><U+0432><U+043B><U+044C> <U+0422><U+0426> "<U+0410><U+043B><U+044C><U+0442><U+0430><U+0438><U+0440>" 42108
In [14]:
# The will give the most popular shop (i.e. by sales volume)

popular_shop = merged_sales %>% 
group_by(shop_id,shop_name) %>%
summarise(total = sum(item_cnt_day)) %>%
arrange(desc(total))

popular_shop
`summarise()` regrouping output by 'shop_id' (override with `.groups` argument)

A grouped_df: 60 × 3
shop_idshop_nametotal
<fct><chr><dbl>
31<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+043C><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" 310777
25<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" 241920
54<U+0425><U+0438><U+043C><U+043A><U+0438> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 185790
28<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041C><U+0415><U+0413><U+0410> <U+0422><U+0435><U+043F><U+043B><U+044B><U+0439> <U+0421><U+0442><U+0430><U+043D>" II 184557
42<U+0421><U+041F><U+0431> <U+0422><U+041A> "<U+041D><U+0435><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0426><U+0435><U+043D><U+0442><U+0440>" 144934
57<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 141107
27<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041C><U+0415><U+0413><U+0410> <U+0411><U+0435><U+043B><U+0430><U+044F> <U+0414><U+0430><U+0447><U+0430> II" 136657
6 <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> (<U+041F><U+043B><U+0435><U+0445><U+0430><U+043D><U+043E><U+0432><U+0441><U+043A><U+0430><U+044F>, 13) 100489
58<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" 81734
46<U+0421><U+0435><U+0440><U+0433><U+0438><U+0435><U+0432> <U+041F><U+043E><U+0441><U+0430><U+0434> <U+0422><U+0426> "7<U+042F>" 78990
56<U+0427><U+0435><U+0445><U+043E><U+0432> <U+0422><U+0420><U+0426> "<U+041A><U+0430><U+0440><U+043D><U+0430><U+0432><U+0430><U+043B>" 78079
50<U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0426> "<U+0413><U+0443><U+0434><U+0432><U+0438><U+043D>" 76238
12<U+0418><U+043D><U+0442><U+0435><U+0440><U+043D><U+0435><U+0442>-<U+043C><U+0430><U+0433><U+0430><U+0437><U+0438><U+043D> <U+0427><U+0421> 73478
19<U+041A><U+0443><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+041F><U+0443><U+0448><U+043A><U+0438><U+043D><U+0441><U+043A><U+0438><U+0439>" 73455
15<U+041A><U+0430><U+043B><U+0443><U+0433><U+0430> <U+0422><U+0420><U+0426> "XXI <U+0432><U+0435><U+043A>" 71201
35<U+041D>.<U+041D><U+043E><U+0432><U+0433><U+043E><U+0440><U+043E><U+0434> <U+0422><U+0420><U+0426> "<U+0424><U+0430><U+043D><U+0442><U+0430><U+0441><U+0442><U+0438><U+043A><U+0430>" 69016
21<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+041C><U+0422><U+0420><U+0426> "<U+0410><U+0444><U+0438> <U+041C><U+043E><U+043B><U+043B>" 68560
26<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0410><U+0440><U+0435><U+0430><U+043B>" (<U+0411><U+0435><U+043B><U+044F><U+0435><U+0432><U+043E>) 67890
47<U+0421><U+0443><U+0440><U+0433><U+0443><U+0442> <U+0422><U+0420><U+0426> "<U+0421><U+0438><U+0442><U+0438> <U+041C><U+043E><U+043B><U+043B>" 67637
7 <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+043A><U+0441><U+0438><U+043C><U+0438><U+0440>" 67058
18<U+041A><U+0440><U+0430><U+0441><U+043D><U+043E><U+044F><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+0418><U+044E><U+043D><U+044C>" 65486
24<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+041A> "<U+0411><U+0443><U+0434><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" (<U+043F><U+0430><U+0432>.<U+041A>7) 63886
55<U+0426><U+0438><U+0444><U+0440><U+043E><U+0432><U+043E><U+0439> <U+0441><U+043A><U+043B><U+0430><U+0434> 1<U+0421>-<U+041E><U+043D><U+043B><U+0430><U+0439><U+043D> 63388
53<U+0423><U+0444><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+043C><U+044C><U+044F>" 2 61657
16<U+041A><U+043E><U+043B><U+043E><U+043C><U+043D><U+0430> <U+0422><U+0426> "<U+0420><U+0438><U+043E>" 61633
30<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041F><U+0435><U+0440><U+043B><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" 60828
22<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+041C><U+0430><U+0433><U+0430><U+0437><U+0438><U+043D> <U+0421>21 60230
29<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041D><U+043E><U+0432><U+044B><U+0439> <U+0432><U+0435><U+043A>" (<U+041D><U+043E><U+0432><U+043E><U+043A><U+043E><U+0441><U+0438><U+043D><U+043E>) 58713
38<U+041E><U+043C><U+0441><U+043A> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 53886
43<U+0421><U+041F><U+0431> <U+0422><U+041A> "<U+0421><U+0435><U+043D><U+043D><U+0430><U+044F>" 50608
52<U+0423><U+0444><U+0430> <U+0422><U+041A> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" 49744
41<U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 49324
59<U+042F><U+0440><U+043E><U+0441><U+043B><U+0430><U+0432><U+043B><U+044C> <U+0422><U+0426> "<U+0410><U+043B><U+044C><U+0442><U+0430><U+0438><U+0440>" 48993
51<U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0426> "<U+0417><U+0435><U+043B><U+0435><U+043D><U+044B><U+0439> <U+0411><U+0435><U+0440><U+0435><U+0433>" 48767
14<U+041A><U+0430><U+0437><U+0430><U+043D><U+044C> <U+0422><U+0426> "<U+041F><U+0430><U+0440><U+043A><U+0425><U+0430><U+0443><U+0441>" II 46375
37<U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 46256
44<U+0421><U+0430><U+043C><U+0430><U+0440><U+0430> <U+0422><U+0426> "<U+041C><U+0435><U+043B><U+043E><U+0434><U+0438><U+044F>" 44938
4 <U+0412><U+043E><U+043B><U+0436><U+0441><U+043A><U+0438><U+0439> <U+0422><U+0426> "<U+0412><U+043E><U+043B><U+0433><U+0430> <U+041C><U+043E><U+043B><U+043B>" 43942
5 <U+0412><U+043E><U+043B><U+043E><U+0433><U+0434><U+0430> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+0440><U+043C><U+0435><U+043B><U+0430><U+0434>" 42762
45<U+0421><U+0430><U+043C><U+0430><U+0440><U+0430> <U+0422><U+0426> "<U+041F><U+0430><U+0440><U+043A><U+0425><U+0430><U+0443><U+0441>" 41895
2 <U+0410><U+0434><U+044B><U+0433><U+0435><U+044F> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 30620
3 <U+0411><U+0430><U+043B><U+0430><U+0448><U+0438><U+0445><U+0430> <U+0422><U+0420><U+041A> "<U+041E><U+043A><U+0442><U+044F><U+0431><U+0440><U+044C>-<U+041A><U+0438><U+043D><U+043E><U+043C><U+0438><U+0440>" 28355
17<U+041A><U+0440><U+0430><U+0441><U+043D><U+043E><U+044F><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+0412><U+0437><U+043B><U+0435><U+0442><U+043A><U+0430> <U+041F><U+043B><U+0430><U+0437><U+0430>" 25838
48<U+0422><U+043E><U+043C><U+0441><U+043A> <U+0422><U+0420><U+0426> "<U+0418><U+0437><U+0443><U+043C><U+0440><U+0443><U+0434><U+043D><U+044B><U+0439> <U+0413><U+043E><U+0440><U+043E><U+0434>" 24909
10<U+0416><U+0443><U+043A><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0443><U+043B>. <U+0427><U+043A><U+0430><U+043B><U+043E><U+0432><U+0430> 39<U+043C>? 24523
13<U+041A><U+0430><U+0437><U+0430><U+043D><U+044C> <U+0422><U+0426> "<U+0411><U+0435><U+0445><U+0435><U+0442><U+043B><U+0435>" 19763
49<U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0420><U+0426> "<U+041A><U+0440><U+0438><U+0441><U+0442><U+0430><U+043B><U+043B>" 17090
39<U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0420><U+041A> "<U+041C><U+0435><U+0433><U+0430><U+0446><U+0435><U+043D><U+0442><U+0440> <U+0413><U+043E><U+0440><U+0438><U+0437><U+043E><U+043D><U+0442>" 16617
9 <U+0412><U+044B><U+0435><U+0437><U+0434><U+043D><U+0430><U+044F> <U+0422><U+043E><U+0440><U+0433><U+043E><U+0432><U+043B><U+044F> 15866
0 !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 <U+0444><U+0440><U+0430><U+043D> 11705
32<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+0440><U+0435><U+0431><U+0440><U+044F><U+043D><U+044B><U+0439> <U+0414><U+043E><U+043C>" 8781
23<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+041A> "<U+0411><U+0443><U+0434><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" (<U+043F><U+0430><U+0432>.<U+0410>2) 7705
34<U+041D>.<U+041D><U+043E><U+0432><U+0433><U+043E><U+0440><U+043E><U+0434> <U+0422><U+0420><U+0426> "<U+0420><U+0418><U+041E>" 6451
1 !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" <U+0444><U+0440><U+0430><U+043D> 6311
20<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> "<U+0420><U+0430><U+0441><U+043F><U+0440><U+043E><U+0434><U+0430><U+0436><U+0430>" 5872
33<U+041C><U+044B><U+0442><U+0438><U+0449><U+0438> <U+0422><U+0420><U+041A> "XL-3" 5482
40<U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0420><U+041A> "<U+041C><U+0435><U+0433><U+0430><U+0446><U+0435><U+043D><U+0442><U+0440> <U+0413><U+043E><U+0440><U+0438><U+0437><U+043E><U+043D><U+0442>" <U+041E><U+0441><U+0442><U+0440><U+043E><U+0432><U+043D><U+043E><U+0439> 4943
8 <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> <U+0422><U+0420><U+0426> <U+0421><U+0438><U+0442><U+0438>-<U+041F><U+0430><U+0440><U+043A> "<U+0413><U+0440><U+0430><U+0434>" 3595
11<U+0416><U+0443><U+043A><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0443><U+043B>. <U+0427><U+043A><U+0430><U+043B><U+043E><U+0432><U+0430> 39<U+043C>² 572
36<U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A> <U+0422><U+0420><U+0426> "<U+0413><U+0430><U+043B><U+0435><U+0440><U+0435><U+044F> <U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A>" 330
In [15]:
library(plotly)
Attaching package: 'plotly'


The following object is masked from 'package:ggplot2':

    last_plot


The following object is masked from 'package:stats':

    filter


The following object is masked from 'package:graphics':

    layout


In [16]:
# Plot for the shop id vs total quantity sold

plot_ly(popular_shop, x=~total, y=~reorder(as.factor(shop_id) , total), 
        type="bar", color=~shop_id, hoverinfo='text', text=~paste(shop_name,':',total), orientation='h') %>%
layout(title="Most Popular Shop by Total Quantity Sold.", xaxis=list(title="Total Quantity Sold"), 
       yaxis=list(title="Shop ID"),legend=list(title=list(text='<b> Shop ID </b>')))
In [17]:
# Unique number of items 

unique_items = merged_sales %>% 
distinct(item_id) %>%
summarise(items=n())

unique_items
A tibble: 1 × 1
items
<int>
21807
In [18]:
item_freq = merged_sales %>% 
group_by(item_id,item_name) %>%
summarise(freq =n()) %>%
arrange(desc(freq))

head(item_freq)
`summarise()` regrouping output by 'item_id' (override with `.groups` argument)

A grouped_df: 6 × 3
item_iditem_namefreq
<fct><chr><int>
20949<U+0424><U+0438><U+0440><U+043C><U+0435><U+043D><U+043D><U+044B><U+0439> <U+043F><U+0430><U+043A><U+0435><U+0442> <U+043C><U+0430><U+0439><U+043A><U+0430> 1<U+0421> <U+0418><U+043D><U+0442><U+0435><U+0440><U+0435><U+0441> <U+0431><U+0435><U+043B><U+044B><U+0439> (34*42) 45 <U+043C><U+043A><U+043C> 31340
5822 Playstation Store <U+043F><U+043E><U+043F><U+043E><U+043B><U+043D><U+0435><U+043D><U+0438><U+0435> <U+0431><U+0443><U+043C><U+0430><U+0436><U+043D><U+0438><U+043A><U+0430>: <U+041A><U+0430><U+0440><U+0442><U+0430> <U+043E><U+043F><U+043B><U+0430><U+0442><U+044B> 1000 <U+0440><U+0443><U+0431>. 9408
17717<U+041F><U+0440><U+0438><U+0435><U+043C> <U+0434><U+0435><U+043D><U+0435><U+0436><U+043D><U+044B><U+0445> <U+0441><U+0440><U+0435><U+0434><U+0441><U+0442><U+0432> <U+0434><U+043B><U+044F> 1<U+0421>-<U+041E><U+043D><U+043B><U+0430><U+0439><U+043D> 9067
2808 Diablo III [PC, Jewel, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] 7479
4181 Kaspersky Internet Security Multi-Device Russian Edition. 2-Device 1 year Renewal Box 6853
7856 World of Warcraft. <U+041A><U+0430><U+0440><U+0442><U+0430> <U+043E><U+043F><U+043B><U+0430><U+0442><U+044B> <U+0438><U+0433><U+0440><U+043E><U+0432><U+043E><U+0433><U+043E> <U+0432><U+0440><U+0435><U+043C><U+0435><U+043D><U+0438> (online) (<U+0440><U+0443><U+0441>.<U+0432>.) (60 <U+0434><U+043D><U+0435><U+0439>) (Jewel) 6602
In [19]:
# Most sold item for each shop

popular_item = merged_sales %>% 
group_by(shop_id, item_id) %>%
summarise(quantity_sold = sum(item_cnt_day)) %>%
filter(quantity_sold == max(quantity_sold))


popular_item
`summarise()` regrouping output by 'shop_id' (override with `.groups` argument)

A grouped_df: 60 × 3
shop_iditem_idquantity_sold
<fct><fct><dbl>
0 19811 79
1 13354 62
2 20949 1319
3 20949 1137
4 20949 2465
5 20949 2163
6 20949 3551
7 20949 3321
8 12168 31
9 7096 305
1020949 1413
1120949 46
1211373 6563
1320949 1023
1420949 3219
1520949 2907
1620949 3395
1720949 1377
1820949 2826
1920949 2347
2015275 141
2120949 4335
2220949 4382
234164 89
2420949 2918
252094914343
2620949 3624
2720949 8265
282094913458
2920949 2167
3020949 3922
312094919934
322808 178
3320949 379
3420949 328
3520949 3434
3620949 16
3720949 2727
3820949 2096
3920949 1867
4020949 648
4120949 2100
422094910514
4320949 2266
4420949 1060
4520949 1404
4620949 3891
4720949 4530
4820949 1619
4920949 722
5020949 2544
5120949 1074
5220949 1920
5320949 3540
542094912323
557967 2365
5620949 3439
5720949 6116
5820949 2928
5920949 2225
In [20]:
# Plot for the most quantity sold for each shop

plot_ly(popular_item,y=~shop_id, x=~quantity_sold,
        type="bar", color=~as.factor(item_id), orientation='h', hoverinfo='text', text=~paste('Shop',shop_id,':','Item',item_id)) %>%
    layout(title="Most Popular Item per Shop", yaxis=list(title="Shop"), xaxis=list(title="Quantity Sold"), legend=list(title=list(text='<b> Item ID </b>')))
In [21]:
# Repetition of categories in the data set 

item_cat_freq = merged_sales %>% 
    group_by(item_category_id,item_category_name) %>%
    summarise(freq =n()) %>%
    arrange(desc(freq))

head(item_cat_freq)
`summarise()` regrouping output by 'item_category_id' (override with `.groups` argument)

A grouped_df: 6 × 3
item_category_iditem_category_namefreq
<fct><chr><int>
40<U+041A><U+0438><U+043D><U+043E> - DVD 564652
30<U+0418><U+0433><U+0440><U+044B> PC - <U+0421><U+0442><U+0430><U+043D><U+0434><U+0430><U+0440><U+0442><U+043D><U+044B><U+0435> <U+0438><U+0437><U+0434><U+0430><U+043D><U+0438><U+044F> 351591
55<U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - CD <U+043B><U+043E><U+043A><U+0430><U+043B><U+044C><U+043D><U+043E><U+0433><U+043E> <U+043F><U+0440><U+043E><U+0438><U+0437><U+0432><U+043E><U+0434><U+0441><U+0442><U+0432><U+0430>339585
19<U+0418><U+0433><U+0440><U+044B> - PS3 208219
37<U+041A><U+0438><U+043D><U+043E> - Blu-Ray 192674
23<U+0418><U+0433><U+0440><U+044B> - XBOX 360 146789
In [22]:
# Find number of unique categories

item_cat_unique = merged_sales %>%
                select(item_category_id) %>% 
                distinct() %>% 
                count()

item_cat_unique
A tibble: 1 × 1
n
<int>
84
In [23]:
# Find number of categories per shop

categories_shop = merged_sales %>% 
                group_by(shop_id) %>%
                distinct(item_category_id) %>% 
                summarise(no_distinct_categories = n()) 

categories_shop
`summarise()` ungrouping output (override with `.groups` argument)

A tibble: 60 × 2
shop_idno_distinct_categories
<fct><int>
0 48
1 41
2 59
3 58
4 61
5 60
6 63
7 60
8 43
9 38
1057
1135
1264
1339
1460
1559
1663
1755
1862
1960
2012
2161
2261
2344
2458
2565
2661
2760
2860
2960
3059
3162
3247
3347
3449
3562
3635
3763
3863
3956
4026
4160
4262
4357
4462
4559
4660
4760
4852
4954
5059
5156
5261
5360
5459
5512
5663
5761
5858
5960
In [24]:
# Plot for shop vs number of categories for each shop

plot_ly(categories_shop, x=~no_distinct_categories, y=~reorder(as.factor(shop_id), no_distinct_categories),
       type='bar', orientation='h', color=~as.factor(shop_id), hoverinfo='text', text=~paste('Shop',shop_id,':',no_distinct_categories,'Categories')) %>%
    layout(title= "Number of categories per shop", xaxis=list(title= "Number of distinct categories"), yaxis=list(title= "Shops"), showlegend=FALSE)
In [25]:
#  Find top 10 selling products

top10_prod = merged_sales %>% 
            group_by(item_id,item_name) %>%
            summarise(qty_vol = sum(item_cnt_day)) %>%
            arrange(desc(qty_vol)) %>%
            ungroup() %>%
            top_n(10)

top10_prod
`summarise()` regrouping output by 'item_id' (override with `.groups` argument)

Selecting by qty_vol

A tibble: 10 × 3
item_iditem_nameqty_vol
<fct><chr><dbl>
20949<U+0424><U+0438><U+0440><U+043C><U+0435><U+043D><U+043D><U+044B><U+0439> <U+043F><U+0430><U+043A><U+0435><U+0442> <U+043C><U+0430><U+0439><U+043A><U+0430> 1<U+0421> <U+0418><U+043D><U+0442><U+0435><U+0440><U+0435><U+0441> <U+0431><U+0435><U+043B><U+044B><U+0439> (34*42) 45 <U+043C><U+043A><U+043C>187642
2808 Diablo III [PC, Jewel, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] 17245
3732 Grand Theft Auto V [PS3, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0438><U+0435> <U+0441><U+0443><U+0431><U+0442><U+0438><U+0442><U+0440><U+044B>] 16642
17717<U+041F><U+0440><U+0438><U+0435><U+043C> <U+0434><U+0435><U+043D><U+0435><U+0436><U+043D><U+044B><U+0445> <U+0441><U+0440><U+0435><U+0434><U+0441><U+0442><U+0432> <U+0434><U+043B><U+044F> 1<U+0421>-<U+041E><U+043D><U+043B><U+0430><U+0439><U+043D> 15830
5822 Playstation Store <U+043F><U+043E><U+043F><U+043E><U+043B><U+043D><U+0435><U+043D><U+0438><U+0435> <U+0431><U+0443><U+043C><U+0430><U+0436><U+043D><U+0438><U+043A><U+0430>: <U+041A><U+0430><U+0440><U+0442><U+0430> <U+043E><U+043F><U+043B><U+0430><U+0442><U+044B> 1000 <U+0440><U+0443><U+0431>. 14515
3734 Grand Theft Auto V [Xbox 360, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0438><U+0435> <U+0441><U+0443><U+0431><U+0442><U+0438><U+0442><U+0440><U+044B>] 11688
6675 Sony PlayStation 4 (500 Gb) Black (CUH-1008A/1108A/B01) 10289
3731 Grand Theft Auto V [PC, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0438><U+0435> <U+0441><U+0443><U+0431><U+0442><U+0438><U+0442><U+0440><U+044B>] 10099
1855 Battlefield 4 [PC, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] 10032
16787<U+041E><U+0434><U+043D><U+0438> <U+0438><U+0437> <U+043D><U+0430><U+0441> [PS3, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] 9227
In [26]:
# Plot for the quantity vs item for the top 10 selling items

plot_ly(top10_prod, x=~item_name, y=~qty_vol, type='bar', color=~as.factor(item_id), hoverinfo='text', text=~paste(item_name,':',qty_vol)) %>%
    layout(title="Top 10 Selling Products", xaxis=list(title="Item ID"), yaxis=list(title="Quantity Sold"), showlegend=FALSE)
In [27]:
# Find top 10 selling categories

top10_cat = merged_sales %>% 
            group_by(item_category_id,item_category_name) %>%
            summarise(qty_vol = sum(item_cnt_day)) %>%
            arrange(desc(qty_vol)) %>%
            ungroup() %>%
            top_n(10)

top10_cat
`summarise()` regrouping output by 'item_category_id' (override with `.groups` argument)

Selecting by qty_vol

A tibble: 10 × 3
item_category_iditem_category_nameqty_vol
<fct><chr><dbl>
40<U+041A><U+0438><U+043D><U+043E> - DVD 634171
30<U+0418><U+0433><U+0440><U+044B> PC - <U+0421><U+0442><U+0430><U+043D><U+0434><U+0430><U+0440><U+0442><U+043D><U+044B><U+0435> <U+0438><U+0437><U+0434><U+0430><U+043D><U+0438><U+044F> 456540
55<U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - CD <U+043B><U+043E><U+043A><U+0430><U+043B><U+044C><U+043D><U+043E><U+0433><U+043E> <U+043F><U+0440><U+043E><U+0438><U+0437><U+0432><U+043E><U+0434><U+0441><U+0442><U+0432><U+0430> 348591
19<U+0418><U+0433><U+0440><U+044B> - PS3 254887
37<U+041A><U+0438><U+043D><U+043E> - Blu-Ray 203284
71<U+041F><U+043E><U+0434><U+0430><U+0440><U+043A><U+0438> - <U+0421><U+0443><U+043C><U+043A><U+0438>, <U+0410><U+043B><U+044C><U+0431><U+043E><U+043C><U+044B>, <U+041A><U+043E><U+0432><U+0440><U+0438><U+043A><U+0438> <U+0434>/<U+043C><U+044B><U+0448><U+0438>187998
28<U+0418><U+0433><U+0440><U+044B> PC - <U+0414><U+043E><U+043F><U+043E><U+043B><U+043D><U+0438><U+0442><U+0435><U+043B><U+044C><U+043D><U+044B><U+0435> <U+0438><U+0437><U+0434><U+0430><U+043D><U+0438><U+044F> 174954
23<U+0418><U+0433><U+0440><U+044B> - XBOX 360 169944
20<U+0418><U+0433><U+0440><U+044B> - PS4 127319
65<U+041F><U+043E><U+0434><U+0430><U+0440><U+043A><U+0438> - <U+041D><U+0430><U+0441><U+0442><U+043E><U+043B><U+044C><U+043D><U+044B><U+0435> <U+0438><U+0433><U+0440><U+044B> (<U+043A><U+043E><U+043C><U+043F><U+0430><U+043A><U+0442><U+043D><U+044B><U+0435>) 73077
In [28]:
# Plot for quantity vs categories for the top 10 selling categories

plot_ly(top10_cat, x=~item_category_name, y=~qty_vol, color=~as.factor(item_category_id), type='bar',  hoverinfo='text', text=~paste(item_category_name,':',qty_vol)) %>%
    layout(title="Top 10 Selling Categories", xaxis=list(title="Category"), yaxis=list(title="Quantity Sold"), showlegend=FALSE)
In [29]:
#  Monthly sales for each year

year_month_sales = merged_sales %>% 
                   group_by(year, month) %>%
                   summarise(sales_vol = sum(item_cnt_day * item_price)) %>%
                   arrange(year)

year_month_sales
`summarise()` regrouping output by 'year' (override with `.groups` argument)

A grouped_df: 34 × 3
yearmonthsales_vol
<fct><fct><dbl>
20131 91947091
20132 90665707
20133 104932687
20134 69154286
20135 65065312
20136 79550406
20137 69350600
20138 74297970
20139 119081892
201310107241613
201311138665439
201312207571731
20141 98226023
20142 107431123
20143 109115405
20144 80651755
20145 91281734
20146 82668539
20147 76487921
20148 89604092
20149 114723594
201410112525522
201411152037599
201412232025171
20151 118927700
20152 86593884
20153 86476267
20154 90252814
20155 88577438
20156 65397995
20157 59405787
20158 61748965
20159 92905169
201510 84337113
In [30]:
# Plot for sales by volume vs year with montly bifurcation

plot_ly(year_month_sales, x=~year, y=~sales_vol, color=~month, type='bar', hoverinfo='text', text=~paste(month.name[month],':',round(sales_vol))) %>%
        layout(title="Sales year-month Wise", xaxis=list(title="Year"), yaxis=list(title="Sales Volume"), barmode='stack', legend=list(title=list(text='<b>Month</b>')))
In [31]:
#  Total yearly sales

year_sales = merged_sales %>% 
            group_by(year) %>%
            summarise(sales_vol_year = sum(item_cnt_day * item_price)) 

year_sales
`summarise()` ungrouping output (override with `.groups` argument)

A tibble: 3 × 2
yearsales_vol_year
<fct><dbl>
20131217524734
20141346778479
2015 834623132
In [32]:
# Plot for total sales vs year

plot_ly(year_sales, x=~year, y=~sales_vol_year, type='bar', color=~year, hoverinfo='text', text=~paste(year,':',round(sales_vol_year))) %>%
    layout(title="Total sales per Year", xaxis=list(title="Year"), yaxis=list(title="Total Sales"), legend=list(title=list(text='<b>Year</b>')))
In [33]:
# Find the best selling product by quantity

prod_popularity_years = merged_sales %>% 
                        group_by(year, item_id) %>%
                        summarise(qty_vol = sum(item_cnt_day)) %>%
                        filter(qty_vol == max(qty_vol))

prod_popularity_years
`summarise()` regrouping output by 'year' (override with `.groups` argument)

A grouped_df: 3 × 3
yearitem_idqty_vol
<fct><fct><dbl>
20132094963748
20142094985288
20152094938606
In [34]:
# Plot for the quantity of the most sold product vs year

plot_ly(prod_popularity_years, x=~year, y=~qty_vol, type='bar', hoverinfo='text', text=~paste(year,', Item',item_id,':',qty_vol)) %>%
    layout(title="Most Popular Product per year", xaxis=~list(title="Year"), yaxis=~list(title="Quantity Sold"))
In [35]:
# Find the shop with highest sales volume

sales_volume = merged_sales %>% 
              group_by(shop_id,shop_name) %>%
              summarise(sales_vol = sum(item_cnt_day * item_price)) %>%
              arrange(desc(sales_vol))

sales_volume
`summarise()` regrouping output by 'shop_id' (override with `.groups` argument)

A grouped_df: 60 × 3
shop_idshop_namesales_vol
<fct><chr><dbl>
31<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+043C><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" 235217019.1
25<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" 216480571.4
28<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041C><U+0415><U+0413><U+0410> <U+0422><U+0435><U+043F><U+043B><U+044B><U+0439> <U+0421><U+0442><U+0430><U+043D>" II 159746337.4
42<U+0421><U+041F><U+0431> <U+0422><U+041A> "<U+041D><U+0435><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0426><U+0435><U+043D><U+0442><U+0440>" 151917032.5
54<U+0425><U+0438><U+043C><U+043A><U+0438> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 141939673.0
27<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041C><U+0415><U+0413><U+0410> <U+0411><U+0435><U+043B><U+0430><U+044F> <U+0414><U+0430><U+0447><U+0430> II" 140632474.1
57<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 112999783.5
12<U+0418><U+043D><U+0442><U+0435><U+0440><U+043D><U+0435><U+0442>-<U+043C><U+0430><U+0433><U+0430><U+0437><U+0438><U+043D> <U+0427><U+0421> 112205072.9
6 <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> (<U+041F><U+043B><U+0435><U+0445><U+0430><U+043D><U+043E><U+0432><U+0441><U+043A><U+0430><U+044F>, 13) 98067453.8
18<U+041A><U+0440><U+0430><U+0441><U+043D><U+043E><U+044F><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+0418><U+044E><U+043D><U+044C>" 83392701.0
50<U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0426> "<U+0413><U+0443><U+0434><U+0432><U+0438><U+043D>" 77950597.5
47<U+0421><U+0443><U+0440><U+0433><U+0443><U+0442> <U+0422><U+0420><U+0426> "<U+0421><U+0438><U+0442><U+0438> <U+041C><U+043E><U+043B><U+043B>" 74882096.2
46<U+0421><U+0435><U+0440><U+0433><U+0438><U+0435><U+0432> <U+041F><U+043E><U+0441><U+0430><U+0434> <U+0422><U+0426> "7<U+042F>" 74189318.9
35<U+041D>.<U+041D><U+043E><U+0432><U+0433><U+043E><U+0440><U+043E><U+0434> <U+0422><U+0420><U+0426> "<U+0424><U+0430><U+043D><U+0442><U+0430><U+0441><U+0442><U+0438><U+043A><U+0430>" 70204774.9
24<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+041A> "<U+0411><U+0443><U+0434><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" (<U+043F><U+0430><U+0432>.<U+041A>7) 68811693.5
58<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" 68737617.8
15<U+041A><U+0430><U+043B><U+0443><U+0433><U+0430> <U+0422><U+0420><U+0426> "XXI <U+0432><U+0435><U+043A>" 68710368.4
26<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0410><U+0440><U+0435><U+0430><U+043B>" (<U+0411><U+0435><U+043B><U+044F><U+0435><U+0432><U+043E>) 67551089.6
7 <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+043A><U+0441><U+0438><U+043C><U+0438><U+0440>" 66957658.6
38<U+041E><U+043C><U+0441><U+043A> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 65766434.4
19<U+041A><U+0443><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+041F><U+0443><U+0448><U+043A><U+0438><U+043D><U+0441><U+043A><U+0438><U+0439>" 64953921.6
21<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+041C><U+0422><U+0420><U+0426> "<U+0410><U+0444><U+0438> <U+041C><U+043E><U+043B><U+043B>" 62581387.8
43<U+0421><U+041F><U+0431> <U+0422><U+041A> "<U+0421><U+0435><U+043D><U+043D><U+0430><U+044F>" 62217699.4
22<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+041C><U+0430><U+0433><U+0430><U+0437><U+0438><U+043D> <U+0421>21 60438890.2
56<U+0427><U+0435><U+0445><U+043E><U+0432> <U+0422><U+0420><U+0426> "<U+041A><U+0430><U+0440><U+043D><U+0430><U+0432><U+0430><U+043B>" 58605903.5
16<U+041A><U+043E><U+043B><U+043E><U+043C><U+043D><U+0430> <U+0422><U+0426> "<U+0420><U+0438><U+043E>" 57436486.6
29<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041D><U+043E><U+0432><U+044B><U+0439> <U+0432><U+0435><U+043A>" (<U+041D><U+043E><U+0432><U+043E><U+043A><U+043E><U+0441><U+0438><U+043D><U+043E>) 56550135.5
53<U+0423><U+0444><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+043C><U+044C><U+044F>" 2 56511041.7
55<U+0426><U+0438><U+0444><U+0440><U+043E><U+0432><U+043E><U+0439> <U+0441><U+043A><U+043B><U+0430><U+0434> 1<U+0421>-<U+041E><U+043D><U+043B><U+0430><U+0439><U+043D> 49792061.5
14<U+041A><U+0430><U+0437><U+0430><U+043D><U+044C> <U+0422><U+0426> "<U+041F><U+0430><U+0440><U+043A><U+0425><U+0430><U+0443><U+0441>" II 46737281.5
30<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041F><U+0435><U+0440><U+043B><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" 46702461.4
41<U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 46092990.5
37<U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 45974685.0
59<U+042F><U+0440><U+043E><U+0441><U+043B><U+0430><U+0432><U+043B><U+044C> <U+0422><U+0426> "<U+0410><U+043B><U+044C><U+0442><U+0430><U+0438><U+0440>" 45226578.9
52<U+0423><U+0444><U+0430> <U+0422><U+041A> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" 45054241.7
2 <U+0410><U+0434><U+044B><U+0433><U+0435><U+044F> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" 44049638.8
45<U+0421><U+0430><U+043C><U+0430><U+0440><U+0430> <U+0422><U+0426> "<U+041F><U+0430><U+0440><U+043A><U+0425><U+0430><U+0443><U+0441>" 40924797.5
4 <U+0412><U+043E><U+043B><U+0436><U+0441><U+043A><U+0438><U+0439> <U+0422><U+0426> "<U+0412><U+043E><U+043B><U+0433><U+0430> <U+041C><U+043E><U+043B><U+043B>" 40539650.3
5 <U+0412><U+043E><U+043B><U+043E><U+0433><U+0434><U+0430> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+0440><U+043C><U+0435><U+043B><U+0430><U+0434>" 38160429.0
44<U+0421><U+0430><U+043C><U+0430><U+0440><U+0430> <U+0422><U+0426> "<U+041C><U+0435><U+043B><U+043E><U+0434><U+0438><U+044F>" 34204028.3
3 <U+0411><U+0430><U+043B><U+0430><U+0448><U+0438><U+0445><U+0430> <U+0422><U+0420><U+041A> "<U+041E><U+043A><U+0442><U+044F><U+0431><U+0440><U+044C>-<U+041A><U+0438><U+043D><U+043E><U+043C><U+0438><U+0440>" 30140853.9
17<U+041A><U+0440><U+0430><U+0441><U+043D><U+043E><U+044F><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+0412><U+0437><U+043B><U+0435><U+0442><U+043A><U+0430> <U+041F><U+043B><U+0430><U+0437><U+0430>" 27784948.3
48<U+0422><U+043E><U+043C><U+0441><U+043A> <U+0422><U+0420><U+0426> "<U+0418><U+0437><U+0443><U+043C><U+0440><U+0443><U+0434><U+043D><U+044B><U+0439> <U+0413><U+043E><U+0440><U+043E><U+0434>" 27714110.0
51<U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0426> "<U+0417><U+0435><U+043B><U+0435><U+043D><U+044B><U+0439> <U+0411><U+0435><U+0440><U+0435><U+0433>" 24205161.2
49<U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0420><U+0426> "<U+041A><U+0440><U+0438><U+0441><U+0442><U+0430><U+043B><U+043B>" 20140923.5
10<U+0416><U+0443><U+043A><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0443><U+043B>. <U+0427><U+043A><U+0430><U+043B><U+043E><U+0432><U+0430> 39<U+043C>? 19410283.4
39<U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0420><U+041A> "<U+041C><U+0435><U+0433><U+0430><U+0446><U+0435><U+043D><U+0442><U+0440> <U+0413><U+043E><U+0440><U+0438><U+0437><U+043E><U+043D><U+0442>" 18092891.0
9 <U+0412><U+044B><U+0435><U+0437><U+0434><U+043D><U+0430><U+044F> <U+0422><U+043E><U+0440><U+0433><U+043E><U+0432><U+043B><U+044F> 16372837.0
34<U+041D>.<U+041D><U+043E><U+0432><U+0433><U+043E><U+0440><U+043E><U+0434> <U+0422><U+0420><U+0426> "<U+0420><U+0418><U+041E>" 8582822.0
0 !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 <U+0444><U+0440><U+0430><U+043D> 6637370.0
20<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> "<U+0420><U+0430><U+0441><U+043F><U+0440><U+043E><U+0434><U+0430><U+0436><U+0430>" 6599924.0
13<U+041A><U+0430><U+0437><U+0430><U+043D><U+044C> <U+0422><U+0426> "<U+0411><U+0435><U+0445><U+0435><U+0442><U+043B><U+0435>" 6006173.2
33<U+041C><U+044B><U+0442><U+0438><U+0449><U+0438> <U+0422><U+0420><U+041A> "XL-3" 5949101.6
32<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+0440><U+0435><U+0431><U+0440><U+044F><U+043D><U+044B><U+0439> <U+0414><U+043E><U+043C>" 5772824.0
23<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+041A> "<U+0411><U+0443><U+0434><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" (<U+043F><U+0430><U+0432>.<U+0410>2) 5621521.2
40<U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0420><U+041A> "<U+041C><U+0435><U+0433><U+0430><U+0446><U+0435><U+043D><U+0442><U+0440> <U+0413><U+043E><U+0440><U+0438><U+0437><U+043E><U+043D><U+0442>" <U+041E><U+0441><U+0442><U+0440><U+043E><U+0432><U+043D><U+043E><U+0439> 4293586.8
1 !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" <U+0444><U+0440><U+0430><U+043D> 3238207.0
8 <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> <U+0422><U+0420><U+0426> <U+0421><U+0438><U+0442><U+0438>-<U+041F><U+0430><U+0440><U+043A> "<U+0413><U+0440><U+0430><U+0434>" 2349358.0
11<U+0416><U+0443><U+043A><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0443><U+043B>. <U+0427><U+043A><U+0430><U+043B><U+043E><U+0432><U+0430> 39<U+043C>² 521655.1
36<U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A> <U+0422><U+0420><U+0426> "<U+0413><U+0430><U+043B><U+0435><U+0440><U+0435><U+044F> <U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A>" 377714.0
In [36]:
# Plot for shop vs sales volume

plot_ly(sales_volume, y=~reorder(as.factor(shop_id) , sales_vol), x=~sales_vol, color=~as.factor(shop_id), type='bar', hoverinfo='text', text=~paste('Shop',shop_id,':',round(sales_vol)), orientation='h') %>%
    layout(title="Sales Volume", yaxis=list(title="Shop"), xaxis=list(title="Sales Volume(Amount)"), legend=list(title=list(text='<b>Shop ID</b>')))

After all the visualisation we can determine that 'month' and 'item_category_id' are important features and should be considered to be used for training our model.

In [37]:
# We will now convert our dataframe into a data table which makes it easier for further process

library(data.table) 

train_datatable = as.data.table(merged_sales)
monthly_shop_data = train_datatable[, list(item_cnt_month=(sum(item_cnt_day))/12), by = c("date_block_num", "month","shop_id", "item_category_id", "item_id", "item_price")]
summary(monthly_shop_data)
head(monthly_shop_data)
nrow(monthly_shop_data)
Attaching package: 'data.table'


The following objects are masked from 'package:lubridate':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year


The following objects are masked from 'package:dplyr':

    between, first, last


The following object is masked from 'package:purrr':

    transpose


 date_block_num      month           shop_id        item_category_id
 Min.   : 0.00   1      :175576   31     : 109408   40     :343281  
 1st Qu.: 6.00   3      :167065   25     :  95955   55     :249575  
 Median :14.00   2      :163914   54     :  75612   30     :152805  
 Mean   :14.75   8      :152691   28     :  70733   37     :143685  
 3rd Qu.:23.00   6      :145226   57     :  64235   19     :113445  
 Max.   :33.00   7      :144487   42     :  62337   23     : 86916  
                 (Other):790063   (Other):1260742   (Other):649315  
    item_id          item_price       item_cnt_month     
 17717  :   7475   Min.   :    -1.0   Min.   : -1.83333  
 7893   :   2009   1st Qu.:   199.0   1st Qu.:  0.08333  
 2445   :   1993   Median :   399.0   Median :  0.08333  
 4244   :   1941   Mean   :   845.8   Mean   :  0.17482  
 6675   :   1856   3rd Qu.:   899.0   3rd Qu.:  0.16667  
 4248   :   1833   Max.   :307980.0   Max.   :180.75000  
 (Other):1721915                                         
A data.table: 6 × 7
date_block_nummonthshop_iditem_category_iditem_iditem_priceitem_cnt_month
<dbl><fct><fct><fct><fct><dbl><dbl>
01593722154 999.000.08333333
0125582552 899.000.00000000
0125582554 1709.050.08333333
0125562555 1099.000.08333333
0125592564 349.000.08333333
0125562565 549.000.08333333
1739022
In [38]:
memory.size()
memory.limit()
memory.limit(size=56000)
1313.72
8084
56000

We will divide our data into training(70%) and test data(30%)

In [39]:
library(caret)
set.seed(100)

# Divinding the data in 70/30 ratio into training and testing data
summarized_data = createDataPartition(y = monthly_shop_data$item_cnt_month, p = 0.7, list = F)
sum_training = monthly_shop_data[summarized_data, ]
testing = monthly_shop_data[-summarized_data, ]
Loading required package: lattice


Attaching package: 'caret'


The following object is masked from 'package:purrr':

    lift


We further divide the training data into training(80%) and validation(20%) data

This validation data helps us determine weather the trained model is good or not.

In [40]:
set.seed(100)

# Divinding the training data further in 80/20 ratio into training and validation data
training_data = createDataPartition(y = sum_training$item_cnt_month, p = 0.8, list = F)
training = sum_training[training_data,]
validation = sum_training[-training_data,]
In [41]:
nrow(training)
nrow(validation)
nrow(testing)
973855
243462
521705

After trying and evaluating different models and different fetures we would decide to choose 'date_block_num', 'month', 'shop_id' and 'item_category_id' as our features as per the observations.

Training

In [42]:
model = train(item_cnt_month~date_block_num+month+shop_id+item_category_id, data=training, method="glm", trControl=trainControl(method="cv",number=5))

We are using the glm method for training and for trainControl we are using 5-fold cross validation.

In [43]:
summary(model)
Call:
NULL

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-12.588   -0.087   -0.034    0.016  179.965  

Coefficients: (2 not defined because of singularities)
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         1.240e-01  3.865e-01   0.321 0.748275    
date_block_num     -1.157e-03  6.477e-05 -17.870  < 2e-16 ***
month2             -4.791e-03  2.511e-03  -1.908 0.056424 .  
month3              2.880e-03  2.507e-03   1.149 0.250607    
month4             -4.369e-03  2.627e-03  -1.663 0.096294 .  
month5             -1.238e-02  2.633e-03  -4.702 2.57e-06 ***
month6             -7.247e-03  2.613e-03  -2.773 0.005553 ** 
month7             -1.084e-02  2.622e-03  -4.132 3.59e-05 ***
month8             -1.066e-02  2.593e-03  -4.111 3.93e-05 ***
month9              1.467e-02  2.698e-03   5.435 5.47e-08 ***
month10             1.470e-02  2.729e-03   5.384 7.27e-08 ***
month11             3.087e-02  2.897e-03  10.659  < 2e-16 ***
month12             4.367e-02  2.659e-03  16.426  < 2e-16 ***
shop_id1           -4.429e-02  1.664e-02  -2.662 0.007779 ** 
shop_id2           -9.009e-02  1.205e-02  -7.477 7.59e-14 ***
shop_id3           -9.635e-02  1.197e-02  -8.049 8.37e-16 ***
shop_id4           -7.553e-02  1.157e-02  -6.527 6.71e-11 ***
shop_id5           -7.858e-02  1.159e-02  -6.777 1.23e-11 ***
shop_id6           -3.759e-02  1.112e-02  -3.380 0.000725 ***
shop_id7           -5.579e-02  1.132e-02  -4.928 8.29e-07 ***
shop_id8           -8.616e-02  1.783e-02  -4.833 1.34e-06 ***
shop_id9            4.151e-01  2.014e-02  20.610  < 2e-16 ***
shop_id10          -1.076e-01  1.214e-02  -8.864  < 2e-16 ***
shop_id11          -9.053e-02  3.653e-02  -2.478 0.013212 *  
shop_id12           7.741e-03  1.176e-02   0.659 0.510194    
shop_id13          -7.330e-02  1.249e-02  -5.871 4.33e-09 ***
shop_id14          -6.273e-02  1.161e-02  -5.401 6.63e-08 ***
shop_id15          -4.282e-02  1.134e-02  -3.777 0.000159 ***
shop_id16          -6.473e-02  1.135e-02  -5.701 1.19e-08 ***
shop_id17          -7.867e-02  1.211e-02  -6.496 8.23e-11 ***
shop_id18          -4.786e-02  1.136e-02  -4.214 2.51e-05 ***
shop_id19          -5.227e-02  1.126e-02  -4.644 3.42e-06 ***
shop_id20           3.378e-01  2.693e-02  12.541  < 2e-16 ***
shop_id21          -5.995e-02  1.126e-02  -5.323 1.02e-07 ***
shop_id22          -6.554e-02  1.153e-02  -5.685 1.31e-08 ***
shop_id23          -6.546e-02  1.512e-02  -4.328 1.51e-05 ***
shop_id24          -4.813e-02  1.140e-02  -4.220 2.44e-05 ***
shop_id25           2.060e-02  1.090e-02   1.890 0.058731 .  
shop_id26          -5.195e-02  1.132e-02  -4.587 4.49e-06 ***
shop_id27           1.083e-02  1.109e-02   0.977 0.328772    
shop_id28           2.207e-02  1.099e-02   2.009 0.044569 *  
shop_id29          -4.919e-02  1.145e-02  -4.297 1.73e-05 ***
shop_id30          -5.416e-02  1.135e-02  -4.771 1.83e-06 ***
shop_id31           4.827e-02  1.087e-02   4.442 8.91e-06 ***
shop_id32          -8.848e-02  1.432e-02  -6.177 6.53e-10 ***
shop_id33          -1.021e-01  1.615e-02  -6.320 2.61e-10 ***
shop_id34          -1.275e-01  1.537e-02  -8.295  < 2e-16 ***
shop_id35          -8.335e-02  1.126e-02  -7.402 1.34e-13 ***
shop_id36          -1.676e-01  4.567e-02  -3.669 0.000243 ***
shop_id37          -6.952e-02  1.151e-02  -6.038 1.56e-09 ***
shop_id38          -6.764e-02  1.147e-02  -5.895 3.74e-09 ***
shop_id39          -8.867e-02  1.288e-02  -6.884 5.82e-12 ***
shop_id40          -8.852e-02  1.860e-02  -4.760 1.94e-06 ***
shop_id41          -7.089e-02  1.152e-02  -6.154 7.58e-10 ***
shop_id42          -3.895e-03  1.104e-02  -0.353 0.724140    
shop_id43          -4.475e-02  1.163e-02  -3.847 0.000119 ***
shop_id44          -7.637e-02  1.150e-02  -6.642 3.10e-11 ***
shop_id45          -7.786e-02  1.162e-02  -6.703 2.05e-11 ***
shop_id46          -4.219e-02  1.125e-02  -3.750 0.000177 ***
shop_id47          -5.077e-02  1.135e-02  -4.474 7.68e-06 ***
shop_id48          -6.718e-02  1.229e-02  -5.465 4.63e-08 ***
shop_id49          -1.021e-01  1.263e-02  -8.089 6.03e-16 ***
shop_id50          -4.213e-02  1.126e-02  -3.742 0.000182 ***
shop_id51          -6.771e-02  1.142e-02  -5.931 3.02e-09 ***
shop_id52          -7.226e-02  1.146e-02  -6.305 2.89e-10 ***
shop_id53          -5.786e-02  1.140e-02  -5.077 3.83e-07 ***
shop_id54           1.369e-02  1.095e-02   1.250 0.211249    
shop_id55           1.129e+01  1.143e-01  98.761  < 2e-16 ***
shop_id56          -5.664e-02  1.117e-02  -5.071 3.95e-07 ***
shop_id57          -1.055e-02  1.103e-02  -0.956 0.338989    
shop_id58          -4.030e-02  1.125e-02  -3.583 0.000340 ***
shop_id59          -7.366e-02  1.153e-02  -6.388 1.68e-10 ***
item_category_id1          NA         NA      NA       NA    
item_category_id2   5.886e-02  3.866e-01   0.152 0.878979    
item_category_id3   1.644e-01  3.866e-01   0.425 0.670735    
item_category_id4   6.264e-02  3.870e-01   0.162 0.871422    
item_category_id5   3.656e-02  3.867e-01   0.095 0.924672    
item_category_id6   9.355e-02  3.866e-01   0.242 0.808806    
item_category_id7   7.516e-02  3.868e-01   0.194 0.845910    
item_category_id8   1.558e+00  3.882e-01   4.014 5.96e-05 ***
item_category_id9   6.766e-01  3.869e-01   1.749 0.080312 .  
item_category_id10         NA         NA      NA       NA    
item_category_id11  3.860e-02  3.867e-01   0.100 0.920491    
item_category_id12  2.768e-01  3.868e-01   0.716 0.474127    
item_category_id13  1.831e-02  3.889e-01   0.047 0.962446    
item_category_id14  2.769e-02  3.868e-01   0.072 0.942943    
item_category_id15  5.366e-02  3.867e-01   0.139 0.889648    
item_category_id16  5.917e-02  3.869e-01   0.153 0.878468    
item_category_id17  5.909e-03  3.888e-01   0.015 0.987873    
item_category_id18  7.367e-02  4.574e-01   0.161 0.872027    
item_category_id19  1.187e-01  3.865e-01   0.307 0.758776    
item_category_id20  2.589e-01  3.866e-01   0.670 0.502971    
item_category_id21  5.238e-02  3.866e-01   0.136 0.892209    
item_category_id22  6.543e-02  3.866e-01   0.169 0.865606    
item_category_id23  9.445e-02  3.865e-01   0.244 0.806970    
item_category_id24  7.239e-02  3.866e-01   0.187 0.851462    
item_category_id25  4.338e-02  3.866e-01   0.112 0.910653    
item_category_id26 -1.094e+01  4.286e-01 -25.518  < 2e-16 ***
item_category_id27 -1.130e+01  5.118e-01 -22.079  < 2e-16 ***
item_category_id28  1.921e-01  3.866e-01   0.497 0.619237    
item_category_id29  2.149e-01  3.866e-01   0.556 0.578271    
item_category_id30  1.809e-01  3.865e-01   0.468 0.639745    
item_category_id31 -1.109e+01  4.030e-01 -27.511  < 2e-16 ***
item_category_id32  2.469e-01  3.869e-01   0.638 0.523282    
item_category_id33  1.044e-01  3.867e-01   0.270 0.787188    
item_category_id34 -6.346e+00  4.100e-01 -15.479  < 2e-16 ***
item_category_id35  3.799e-01  3.867e-01   0.983 0.325835    
item_category_id36 -1.118e+01  4.713e-01 -23.727  < 2e-16 ***
item_category_id37  3.952e-02  3.865e-01   0.102 0.918569    
item_category_id38  3.783e-02  3.866e-01   0.098 0.922047    
item_category_id39  2.575e-02  3.879e-01   0.066 0.947075    
item_category_id40  7.569e-02  3.865e-01   0.196 0.844749    
item_category_id41  2.893e-02  3.866e-01   0.075 0.940336    
item_category_id42  2.162e-01  3.871e-01   0.559 0.576453    
item_category_id43  1.395e-03  3.866e-01   0.004 0.997121    
item_category_id44 -1.130e+01  4.059e-01 -27.836  < 2e-16 ***
item_category_id45 -2.143e-03  3.867e-01  -0.006 0.995579    
item_category_id46  3.667e-03  4.322e-01   0.008 0.993230    
item_category_id47  1.122e-01  3.868e-01   0.290 0.771780    
item_category_id48 -6.793e-03  4.990e-01  -0.014 0.989138    
item_category_id49  9.056e-02  3.866e-01   0.234 0.814795    
item_category_id50  1.358e-01  5.467e-01   0.248 0.803823    
item_category_id51  5.859e-02  6.695e-01   0.088 0.930255    
item_category_id52  5.183e-02  5.466e-01   0.095 0.924459    
item_category_id53 -3.406e-02  6.695e-01  -0.051 0.959418    
item_category_id54 -1.074e+01  4.037e-01 -26.603  < 2e-16 ***
item_category_id55  4.188e-02  3.865e-01   0.108 0.913716    
item_category_id56  1.612e-02  3.866e-01   0.042 0.966738    
item_category_id57  1.569e-02  3.866e-01   0.041 0.967621    
item_category_id58 -3.676e-04  3.866e-01  -0.001 0.999241    
item_category_id59 -1.082e-02  3.867e-01  -0.028 0.977678    
item_category_id60 -3.018e-03  3.870e-01  -0.008 0.993778    
item_category_id61  3.000e-02  3.866e-01   0.078 0.938149    
item_category_id62  5.092e-02  3.866e-01   0.132 0.895212    
item_category_id63  7.000e-02  3.866e-01   0.181 0.856299    
item_category_id64  5.896e-02  3.866e-01   0.153 0.878779    
item_category_id65  1.637e-01  3.866e-01   0.423 0.672006    
item_category_id66  1.042e-01  3.868e-01   0.269 0.787730    
item_category_id67  3.908e-02  3.866e-01   0.101 0.919472    
item_category_id68 -6.079e-02  3.930e-01  -0.155 0.877084    
item_category_id69  8.686e-02  3.866e-01   0.225 0.822222    
item_category_id70  1.290e-01  3.866e-01   0.334 0.738614    
item_category_id71  9.682e+00  3.870e-01  25.018  < 2e-16 ***
item_category_id72  5.562e-02  3.866e-01   0.144 0.885584    
item_category_id73  1.155e-01  3.867e-01   0.299 0.765192    
item_category_id74 -1.115e+01  4.305e-01 -25.889  < 2e-16 ***
item_category_id75  1.167e-01  3.866e-01   0.302 0.762799    
item_category_id76 -1.110e+01  4.034e-01 -27.518  < 2e-16 ***
item_category_id77  6.641e-02  3.868e-01   0.172 0.863696    
item_category_id78 -1.127e+01  4.034e-01 -27.935  < 2e-16 ***
item_category_id79  1.145e-01  3.866e-01   0.296 0.767151    
item_category_id80  1.207e+00  3.883e-01   3.108 0.001884 ** 
item_category_id81  1.662e-01  3.884e-01   0.428 0.668697    
item_category_id82  1.015e+00  3.873e-01   2.620 0.008790 ** 
item_category_id83  3.187e-01  3.868e-01   0.824 0.410037    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 0.2987583)

    Null deviance: 382574  on 973854  degrees of freedom
Residual deviance: 290902  on 973702  degrees of freedom
AIC: 1587302

Number of Fisher Scoring iterations: 2

Evaluation

In [44]:
# Predicting the data from validation set
p = predict(model, validation)
error = (p - validation$item_cnt_month)
In [45]:
# Comparing the RMSE error from the model and the validation set

RMSE_model = model$results$RMSE; 
cat("RMSE error of the model:",RMSE_model)

RMSE_new = sqrt(mean(error^2)); 
cat("\nRMSE error of the predicted values:",RMSE_new)
RMSE error of the model: 0.5454389
RMSE error of the predicted values: 0.5224954
In [46]:
# Predicting the test set
prediction = predict(model, testing)
In [47]:
# Determine the error in the test set
test_error = (prediction - testing$item_cnt_month)
RMSE_test = sqrt(mean(test_error^2))
cat("RMSE error for the test data:",RMSE_test)
RMSE error for the test data: 0.5833928

These are accesptable values to pass this model. Obviously, we can improve on our model to reduce the error further but that will make out model more complex which is undesiarble.

Prediction

Now we imprt the test data from the CSV file from which we have the predict the data.

In [48]:
# import test data to be predicted
test_data = read_csv("test.csv")

# Assign 11 to month for November
test_data$month = 11
test_data$month = as.factor(test_data$month)
test_data$shop_id = as.factor(test_data$shop_id)

# Assign 34 to date_block_num for Nov 2015
test_data$date_block_num = 34
head(test_data)
Parsed with column specification:
cols(
  ID = col_double(),
  shop_id = col_double(),
  item_id = col_double()
)

A tibble: 6 × 5
IDshop_iditem_idmonthdate_block_num
<dbl><fct><dbl><fct><dbl>
0550371134
1553201134
2552331134
3552321134
4552681134
5550391134
In [49]:
#combine test_data and items_data by item_id to get item_category_id

combined_test = left_join(x=test_data, y = items, by = c("item_id"))
combined_test$item_category_id = as.factor(combined_test$item_category_id)
head(combined_test)
A tibble: 6 × 7
IDshop_iditem_idmonthdate_block_numitem_nameitem_category_id
<dbl><fct><dbl><fct><dbl><chr><fct>
0550371134NHL 15 [PS3, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0438><U+0435> <U+0441><U+0443><U+0431><U+0442><U+0438><U+0442><U+0440><U+044B>] 19
1553201134ONE DIRECTION Made In The A.M. 55
2552331134Need for Speed Rivals (Essentials) [PS3, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] 19
3552321134Need for Speed Rivals (Classics) [Xbox 360, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>]23
4552681134Need for Speed [PS4, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] 20
5550391134NHL 15 [Xbox 360, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0438><U+0435> <U+0441><U+0443><U+0431><U+0442><U+0438><U+0442><U+0440><U+044B>] 23
In [50]:
# Fit the model for our predictions
test_prediction = predict(model, combined_test)
In [51]:
# Prepart and write the predicted data as per the required format into a CSV file.
submit = cbind(combined_test[,1], test_prediction)
colnames(submit) = c("ID", "item_cnt_month")
class(submit)
submit$ID = as.integer(submit$ID)
head(submit)

write.csv(submit, "submission.csv", row.names = FALSE, quote = FALSE)
'data.frame'
A data.frame: 6 × 2
IDitem_cnt_month
<int><dbl>
100.15569690
210.07887593
320.15569690
430.13144073
540.29592050
650.13144073
In [ ]: